Load Packages
library(rvest)
library(tidyverse)
library(genius)
library(tidytext)
library(dplyr)
library(ggplot2)
library(qdap)
library(mosaic)
library(lubridate)
library(data.table)
library(gridExtra)
Data Access
# read webpage for Grammy Awards
webpage <- read_html("https://en.wikipedia.org/wiki/Grammy_Award_for_Record_of_the_Year")
# copy xpath for table of 1980 to 2019
XPATH1980 <- '/html/body/div[3]/div[3]/div[5]/div[1]/table[5]'
XPATH1990 <- '/html/body/div[3]/div[3]/div[5]/div[1]/table[6]'
XPATH2000 <- '/html/body/div[3]/div[3]/div[5]/div[1]/table[7]'
XPATH2010 <- '/html/body/div[3]/div[3]/div[5]/div[1]/table[8]'
# run the following to create table of songs from 1980 to 2019
# crate table of song in 1980s
table_1980 <-
webpage %>%
html_nodes(xpath = XPATH1980) %>%
html_table(fill = TRUE)
d1980 <- table_1980[[1]]
d1980
# crate table of songs in 1990s
table_1990 <-
webpage %>%
html_nodes(xpath = XPATH1990) %>%
html_table(fill = TRUE)
d1990 <- table_1990[[1]]
d1990
# crate table of songs in 2000s
table_2000 <-
webpage %>%
html_nodes(xpath = XPATH2000) %>%
html_table(fill = TRUE)
d2000 <- table_2000[[1]]
d2000
# crate table of songs in 2010s
table_2010 <-
webpage %>%
html_nodes(xpath = XPATH2010) %>%
html_table(fill = TRUE)
d2010 <- table_2010[[1]]
d2010
NA
Data Wrangling
# combine the lyrics in 1980s and 1990s
table_19801990 <-
d1980 %>%
full_join( d1990 )
Joining, by = c("Year[I]", "Record", "Artist(s)", "Production team")
table_19801990
# combine the songs in 1980s, 1990s and 2000s
table_198019902000 <-
table_19801990 %>%
full_join( d2000 )
Joining, by = c("Year[I]", "Record", "Artist(s)", "Production team")
table_198019902000
# combine the songs in 1980s, 1990s, 2000s and 2010s.
# delete NA
table1980to2010 <-
table_198019902000 %>%
full_join( d2010 ) %>%
na.omit()
Joining, by = c("Year[I]", "Record", "Artist(s)", "Production team")
table1980to2010
# delete the column of Production team
table1980to2010$`Production team` <- NULL
table1980to2010
# change the name of songs in 1980s to 2010s
names(table1980to2010) <- c("Year","track","artis")
table1980to2010
# delete the [] after each year
table1980to2010$Year <-
substr(table1980to2010$Year,1,4)
table1980to2010
NA
# add lyrics
# using genius package to run the add_genius() function
lyrics1980to2010 <- table1980to2010 %>%
add_genius(artis, track, type = "lyrics")
# see the stop_words that in tidytext package
stop_words <-
tidytext::stop_words
stop_words
# make the lyrics to the single words, because if we do not change the lyrics to the single words, I cannot filter out the stop words.
# Using tidytext package to run the unnest_tokens() function
verse_words <- lyrics1980to2010%>%
unnest_tokens(word, lyric)
verse_words
# filter out the words that in the tidytext package
ft <- verse_words %>%
anti_join(stop_words)
Joining, by = "word"
# filter out the other 11 stop words
ft1 <- ft %>%
filter(!word %in% c("ba", "du", "yeah", "da", "ya", "ooh", "gonna", "na", "uh", "la", "hol"))
ft1
Stop words cannot be filtered out while each row still holds a whole line of lyrics, so we first split the lyrics into one word per row with unnest_tokens(word, lyric). Next, we remove the stop words listed in the tidytext package with anti_join(stop_words). Finally, we remove 11 additional stop words that the tidytext list does not cover: the ! operator negates the %in% test, so filter(!word %in% c(“ba”, “du”, “yeah”, “da”, “ya”, “ooh”, “gonna”, “na”, “uh”, “la”, “hol”)) keeps only the words that are not in that list. After these three steps, all stop words have been removed from the lyrics.
Data Visualization
Graph 1
# Filter out the words that used in the lyrics between 1980 and 1989 years and then make a new column in order to make the words that used in the lyrics between 1980 and 1989 to 1980s, because 1980 to 1989 years all belong to 1980s
# Using dplyr package to run the filter() and mutate() function
lyrics1980s <-
verse_words %>%
filter(Year %in% (1980:1989)) %>%
mutate(decade = "1980s")
lyrics1980s
# Filter out the words that used in the lyrics between 1990 and 1999 years and then make a new column in order to make the words that used in the lyrics between 1990 and 1999 to 1990s, because 1990 to 1999 years all belong to 1990s
# Using dplyr package to run the filter() and mutate() function
lyrics1990s <-
verse_words %>%
filter(Year %in% (1990:1999)) %>%
mutate(decade = "1990s")
lyrics1990s
# Filter out the words that used in the lyrics between 2000 and 2009 years and then make a new column in order to make the words that used in the lyrics between 2000 and 2009 to 2000s, because 2000 to 2009 years all belong to 2000s
# Using dplyr package to run the filter() and mutate() function
lyrics2000s <-
verse_words %>%
filter(Year %in% (2000:2009)) %>%
mutate(decade = "2000s")
lyrics2000s
# Filter out the words that used in the lyrics between 2010 and 2019 years and then make a new column in order to make the words that used in the lyrics between 2010 and 2019 to 2010s, because 2010 to 2019 years all belong to 2010s
# Using dplyr package to run the filter() and mutate() function
lyrics2010s <-
verse_words %>%
filter(Year %in% (2010:2019)) %>%
mutate(decade = "2010s")
lyrics2010s
# join the lyrics that in 1980s and 1990s
# Using dplyr package to run the full_join() package
lyrics1980sto1990s <-
lyrics1980s %>%
full_join(lyrics1990s)
Joining, by = c("Year", "track", "artis", "track_title", "line", "word", "decade")
lyrics1980sto1990s
# make lyrics 1980s, 1990s and 2000s together
# Using dplyr package to run the full_join() package
lyrics1980sto2000s <-
lyrics1980sto1990s %>%
full_join(lyrics2000s)
Joining, by = c("Year", "track", "artis", "track_title", "line", "word", "decade")
lyrics1980sto2000s
# make lyrics that are in 1980s, 1990s, 2000s, 2010s together
# Using dplyr package to run the full_join() function
lyrics1980sto2010s <-
lyrics1980sto2000s %>%
full_join(lyrics2010s)
Joining, by = c("Year", "track", "artis", "track_title", "line", "word", "decade")
lyrics1980sto2010s
# group by the track and the decade, and count the n, which is the number of the words, and then we can know Words per Grammy Nominated Song by Decade
# Using dplyr package to run the group_by() function
lyrics1980sto2010s =
lyrics1980sto2010s %>%
group_by(track, decade) %>%
count %>%
ungroup
lyrics1980sto2010s %>% nrow
[1] 174
lyrics1980sto2010s
NA
# Using data in lyrics1980sto2010s to draw the graph that shows Words per Grammy Nominated Song by Decade
# Using ggplot package to run the ggplot() and geom_boxplot() function
lyrics1980sto2010s %>%
ggplot(aes(x = decade, y = n, fill = decade))+
geom_boxplot()+
labs(x = "Decade", y = "Words per Song", title = "Boxplots of Words per Grammy Nominated Song by Decade")

NA
The first graph shows the distribution of the number of words per song for each decade. In the 1980s (1980-1989), the median number of words per song is about 300. In the 1990s (1990-1999), the median is about 290. In the 2000s (2000-2009), the median is about 375. In the 2010s (2010-2019), the median is about 380. The first graph therefore shows how the number of words per song changed from decade to decade, which is an interesting trend.
Graph 2
# count the words and use top_n(10) to know the ten most popular words of Grammy Nominated Songs from 1980 - 2019 (except all the stop words)
# Using mosaic package to run the count() function
# Using dplyr package to run the filter() and top_n() function
Graph2topten <- ft1 %>%
count(word, sort = TRUE) %>%
filter(n >= 3) %>%
#filter(word != 'em') %>% #Review topten and filter words missed by stop_words
top_n(10)
Selecting by n
Graph2topten
# Using the data in Graph2topten to draw the graph that show the ten most popular words of Grammy Nominated Songs from 1980 - 2019
# Using ggplot package to run the ggplot() and geom_col() function
Graph2topten %>%
ggplot(aes(x = reorder(word, -n), y = n)) +
geom_col(fill = "pink")+
labs(y = "Count", x = "Word", title = "Ten Most Popular Words of Grammy Nominated Songs from 1980 - 2019" )

The second graph shows the ten most popular words in Grammy-nominated songs from 1980 to 2019 (excluding stop words), that is, which words appear most often in these songs. The graph shows that “love” is the most popular word in Grammy-nominated songs from 1980 to 2019: it is used more than 500 times!
Graph 3
# Filter out the words that used in the lyrics between 1980 and 1989 years and then make a new column in order to make the words that used in the lyrics between 1980 and 1989 to 1980s, because 1980 to 1989 years all belong to 1980s (except stop words)
# Using dplyr package to run the filter() and mutate() function
newlyrics1980s <-
ft1 %>%
filter(Year %in% (1980:1989)) %>%
mutate(decade = "1980s")
newlyrics1980s
# Filter out the words that used in the lyrics between 1990 and 1999 years and then make a new column in order to make the words that used in the lyrics between 1990 and 1999 to 1990s, because 1990 to 1999 years all belong to 1990s (except all stop words)
# Using dplyr package to run the filter() and mutate() function
newlyrics1990s <-
ft1 %>%
filter(Year %in% (1990:1999)) %>%
mutate(decade = "1990s")
newlyrics1990s
# Filter out the words that used in the lyrics between 2000 and 2009 years and then make a new column in order to make the words that used in the lyrics between 2000 and 2009 to 2000s, because 2000 to 2009 years all belong to 2000s (except all stop words)
# Using dplyr package to run the filter() and mutate() function
newlyrics2000s <-
ft1 %>%
filter(Year %in% (2000:2009)) %>%
mutate(decade = "2000s")
newlyrics2000s
# Filter out the words that used in the lyrics between 2010 and 2019 years and then make a new column in order to make the words that used in the lyrics between 2010 and 2019 to 2010s, because 2010 to 2019 years all belong to 2010s (except all stop words)
# Using dplyr package to run the filter() and mutate() function
newlyrics2010s <-
ft1 %>%
filter(Year %in% (2010:2019)) %>%
mutate(decade = "2010s")
newlyrics2010s
# make words in lyrics that are in 1980s and 1990s together (except all stop words)
# Using dplyr package to run the full_join() function
newlyrics1980sto1990s <-
newlyrics1980s %>%
full_join(newlyrics1990s)
Joining, by = c("Year", "track", "artis", "track_title", "line", "word", "decade")
newlyrics1980sto1990s
# make words in lyrics that are in 1980s, 1990s and 2000s together (except all stop words)
# Using dplyr package to run the full_join() function
newlyrics1980sto2000s <-
newlyrics1980sto1990s %>%
full_join(newlyrics2000s)
Joining, by = c("Year", "track", "artis", "track_title", "line", "word", "decade")
newlyrics1980sto2000s
# make words in lyrics that are in 1980s, 1990s, 2000s and 2010s together (except all stop words)
# Using dplyr package to run the full_join() function
newlyrics1980sto2010s <-
newlyrics1980sto2000s %>%
full_join(newlyrics2010s)
Joining, by = c("Year", "track", "artis", "track_title", "line", "word", "decade")
newlyrics1980sto2010s
# know the top 10 words that used in 1980s
# Using mosaic package to run the count() function
# Using dplyr package to run the filter() and top_n() function
Graph3topten80s <- newlyrics1980s %>%
count(word, sort = TRUE) %>%
filter(n >= 3) %>%
#filter(word != 'em') %>% #Review topten and filter words missed by stop_words
top_n(10)
Selecting by n
Graph3topten80s
# know the top 10 words that used in 1990s (except all stop words)
# Using mosaic package to run the count() function
# Using dplyr package to run the filter() and top_n() function
Graph3topten90s <- newlyrics1990s %>%
count(word, sort = TRUE) %>%
filter(n >= 3) %>%
#filter(word != 'em') %>% #Review topten and filter words missed by stop_words
top_n(10)
Selecting by n
Graph3topten90s
# know the top 10 words that used in 2000s (except all stop words)
# Using mosaic package to run the count() function
# Using dplyr package to run the filter() and top_n() function
Graph3topten00s <- newlyrics2000s %>%
count(word, sort = TRUE) %>%
filter(n >= 3) %>%
#filter(word != 'em') %>% #Review topten and filter words missed by stop_words
top_n(10)
Selecting by n
Graph3topten00s
# know the top 10 words that used in 2010s (except all stop words)
# Using mosaic package to run the count() function
# Using dplyr package to run the filter() and top_n() function
Graph3topten10s <- newlyrics2010s %>%
count(word, sort = TRUE) %>%
filter(n >= 3) %>%
#filter(word != 'em') %>% #Review topten and filter words missed by stop_words
top_n(10)
Selecting by n
Graph3topten10s
# draw a graph that can show the top 10 words in 1980s
# Using ggplot package to run the ggplot() and geom_col() function
Graph3.1 <-
Graph3topten80s %>%
ggplot(aes(x = reorder(word, -n), y = n)) +
geom_col(fill = "Blue")+
labs(y = "Count", x = "Word", title = "1980s" )
# draw a graph that can show the top 10 words in 1990s
# Using ggplot package to run the ggplot() and geom_col() function
Graph3.2 <-
Graph3topten90s %>%
ggplot(aes(x = reorder(word, -n), y = n)) +
geom_col(fill = "pink")+
labs(y = "Count", x = "Word", title = "1990s" )
# draw a graph that can show the top 10 words in 2000s
# Using ggplot package to run the ggplot() and geom_col() function
Graph3.3 <-
Graph3topten00s %>%
ggplot(aes(x = reorder(word, -n), y = n)) +
geom_col(fill = "red")+
labs(y = "Count", x = "Word", title = "2000s" )
# draw a graph that can show the top 10 words in 2010s
# Using ggplot package to run the ggplot() and geom_col() function
Graph3.4 <-
Graph3topten10s %>%
ggplot(aes(x = reorder(word, -n), y = n)) +
geom_col(fill = "orange")+
labs(y = "Count", x = "Word", title = "2010s" )
# Combine the 4 graphs that created before, and combine then together by using grid.arrange() function, and then people can know the top ten words by secade (except stop words)
# Using grid.Extra package to run the grid.arrange() function
Graph3 <-
grid.arrange(Graph3.1, Graph3.2, Graph3.3, Graph3.4, top = "Top Ten Words by Decade")

The third graph shows the top ten words in each decade. In the 1980s (1980-1989), “love” is the most popular word in the Grammy-nominated songs, and it is used more than 150 times. Taken together, the four panels show that “love” is the most popular word in all four decades!
Graph 4
# crate a same data table but different name, because I affraid I got something run and need to run the code from the beginning
newlyrics1980sto2010sa <-
newlyrics1980sto2010s
# know the different sentiments based on the different words in the lyrics that in 1980s to 2010s
# Using dplyr package to run the inner_join() and mutate() function
# Using mosaic package to run the count() function
# Using tidyr package to run the spread() function
sentiment <-
newlyrics1980sto2010sa %>%
inner_join(get_sentiments("bing")) %>%
count(sentiment, word) %>%
spread(sentiment, n, fill = 0) %>%
mutate(polarity = positive - negative)
Joining, by = "word"
sentiment
# know the total words' sentiments in different year
# Using dplyr package to run the mutate() function
# Using mosaic package to run the count() function
# Using tidyr package to run the spread() function
bing <- get_sentiments("bing")
lyricssentiment <- newlyrics1980sto2010sa %>% inner_join(bing) %>%
count(Year, sentiment) %>%
spread(sentiment, n , fill=0) %>%
mutate(sentiment = positive - negative)
Joining, by = "word"
lyricssentiment
# filter out all the negative sentiment, and use positive sentiment
# Using dplyr package to run the filter() function
positive_senti <-
get_sentiments("bing") %>%
filter(sentiment == "positive")
positive_senti
# Know the total positive words' sentiment in different year
# Using dplyr package to run the semi_join() function
# Using mosaic package to run the count() function
lyricssentiment <-
newlyrics1980sto2010sa %>%
semi_join(positive_senti) %>%
count(Year, sort = TRUE)
Joining, by = "word"
lyricssentiment
# Crate a new table that only include the total positive sentiments in 1980 to 1989 and crate a new column called decade, because they all belong to 1980s
# Using dplyr package to run the filter() and mutate() function
senti80s <-
lyricssentiment %>%
filter(Year %in% (1980:1989)) %>%
mutate(decade = "1980s")
senti80s
# Crate a new table that only include the total positive sentiments in 1990 to 1999 and crate a new column called decade, because they all belong to 1990s
# Using dplyr package to run the filter() and mutate() function
senti90s <-
lyricssentiment %>%
filter(Year %in% (1990:1999)) %>%
mutate(decade = "1990s")
senti90s
# Crate a new table that only include the total positive sentiments in 2000 to 2009 and crate a new column called decade, because they all belong to 2000s
# Using dplyr package to run the filter() and mutate() function
senti00s <-
lyricssentiment %>%
filter(Year %in% (2000:2009)) %>%
mutate(decade = "2000s")
senti00s
# Crate a new table that only include the total positive sentiments in 2010 to 2019 and crate a new column called decade, because they all belong to 2010s
# Using dplyr package to run the filter() and mutate() function
senti10s <-
lyricssentiment %>%
filter(Year %in% (2010:2019)) %>%
mutate(decade = "2010s")
senti10s
# make 1980s and 1990s together
# Using dplyr package to run the full_join() function
senti8090 <-
senti80s %>%
full_join(senti90s)
Joining, by = c("Year", "n", "decade")
senti8090
# make 1980s, 1990s, and 2000s together
# Using dplyr package to run the full_join() function
senti809000 <-
senti8090 %>%
full_join(senti00s)
Joining, by = c("Year", "n", "decade")
senti809000
# make 1980s, 1990s, 2000s, and 2010s together
# Using dplyr package to run the full_join() function
lyricssentiment <-
senti809000 %>%
full_join(senti10s)
Joining, by = c("Year", "n", "decade")
lyricssentiment
# Filter out the words that used in the lyrics between 1980 and 1989 years and then make a new column in order to make the words that used in the lyrics between 1980 and 1989 to 1980s, because 1980 to 1989 years all belong to 1980s
# Using dplyr package to run the filter() and mutate() function
newlyrics1980snew <- ft1 %>%
filter(Year %in% (1980:1989)) %>%
mutate(decade = "1980s")
newlyrics1980snew
# Filter out the words that used in the lyrics between 1990 and 1999 years and then make a new column in order to make the words that used in the lyrics between 1990 and 1999 to 1990s, because 1980 to 1989 years all belong to 1990s
# Using dplyr package to run the filter() and mutate() function
newlyrics1990snew <- ft1 %>%
filter(Year %in% (1990:1999)) %>%
mutate(decade = "1990s")
newlyrics1990snew
# Filter out the words that used in the lyrics between 2000 and 2009 years and then make a new column in order to make the words that used in the lyrics between 2000 and 2009 to 2000s, because 1980 to 1989 years all belong to 2000s
# Using dplyr package to run the filter() and mutate() function
newlyrics2000snew <- ft1 %>%
filter(Year %in% (2000:2009)) %>%
mutate(decade = "2000s")
newlyrics2000snew
# Filter out the words that used in the lyrics between 2010 and 2019 years and then make a new column in order to make the words that used in the lyrics between 2000 and 2019 to 2010s, because 1980 to 1989 years all belong to 2010s
# Using dplyr package to run the filter() and mutate() function
newlyrics2010snew <- ft1 %>%
filter(Year %in% (2010:2019)) %>%
mutate(decade = "2010s")
newlyrics2010snew
# make words in lyrics that are in 1980s and 1990s together (except all stop words)
# Using dplyr package to run the full_join() function
newlyrics80sto90snew <- newlyrics1980snew %>%
full_join(newlyrics1990snew)
Joining, by = c("Year", "track", "artis", "track_title", "line", "word", "decade")
newlyrics80sto90snew
# make words in lyrics that are in 1980s, 1990s and 2000s together (except all stop words)
# Using dplyr package to run the full_join() function
newlyrics80sto00snew <- newlyrics80sto90snew %>%
full_join(newlyrics2000snew)
Joining, by = c("Year", "track", "artis", "track_title", "line", "word", "decade")
newlyrics80sto00snew
# make words in lyrics that are in 1980s, 1990s, 2000s and 2010s together (except all stop words)
# Using dplyr package to run the full_join() function
newlyrics80sto10snew <- newlyrics80sto00snew %>%
full_join(newlyrics2010snew)
Joining, by = c("Year", "track", "artis", "track_title", "line", "word", "decade")
newlyrics80sto10snew
# join the last table with the sentiment
# Using dplyr package to run the full_join() function
ft1sentiment <- newlyrics80sto10snew %>%
full_join(sentiments)
Joining, by = "word"
ft1sentiment
# Recode the lexicon sentiment as a 0/1 factor: "negative" -> 0, "positive" -> 1.
# The input is `ft1sentiment` (the joined table built just above). The previous
# code read `ft1sentiment_new` inside its own first definition, which fails
# with "object not found" in a fresh session.
# The levels are spelled out so the 0/1 coding does not silently depend on the
# alphabetical order of the values found in the data.
# na.omit() then drops every row that still has a missing value.
ft1sentiment_new <-
  within(
    ft1sentiment,
    sentiment <- factor(sentiment, levels = c("negative", "positive"), labels = c(0, 1))
  ) %>%
  na.omit()
ft1sentiment_new
# filter out all the 0 (negative sentiment), and use all the 1 (positive sentiment)
# Using dplyr package to run the filter() function
ft1sentiment_new_net <-
ft1sentiment_new %>%
filter(sentiment == "1")
ft1sentiment_new_net
# Draw a graph that shows Net Sentiment Score by Year
# Using ggplot package to run the ggplot(), geom_bar() and theme() function
ggplot(ft1sentiment_new_net) +
aes(x = Year, fill = decade) +
geom_bar() +
labs(x = "Year", y = "Net Sentiment", title = "Net Sentiment Score by Year") +
theme(axis.text = element_text(angle = 30))

The fourth graph shows the net sentiment score by year. Between 1980 and 2019, the lyrics from 2016 have the highest net sentiment score, at more than 100, and the lyrics from 2011 have the lowest net sentiment score, at nearly 5.
Graph 5
# calculate the mean of the sentiment in 1980s
# Using mosaic package to run the mean() function
mean(senti80s$n)
[1] 49.1
# calculate the mean of the sentiment in 1990s
# Using mosaic package to run the mean() function
mean(senti90s$n)
[1] 35
# calculate the mean of the sentiment in 2000s
# Using mosaic package to run the mean() function
mean(senti00s$n)
[1] 34.1
# calculate the mean of the sentiment in 2010s
# Using mosaic package to run the mean() function
mean(senti10s$n)
[1] 41.5
# Build a data frame with the mean sentiment count of each decade.
# The means are computed from the per-decade tables rather than typed in by
# hand, and are stored as numbers. The previous version stored them as
# character strings, which makes ggplot treat the y axis as categorical.
Meansenti <- data.frame(
  decade = c("1980s", "1990s", "2000s", "2010s"),
  mean = c(
    mean(senti80s$n),
    mean(senti90s$n),
    mean(senti00s$n),
    mean(senti10s$n)
  )
)
Meansenti
# Draw a graph that can show the Mean Sentiment Score by Decade
# Using ggplot package to run the ggplot(), geom_col() function
ggplot(Meansenti, aes(decade, mean, fill = decade)) +
geom_col() +
labs(y = "Mean Sentiment Score", x = "Decade", title = "Mean Sentiment Score by Decade" )

The fifth graph shows the mean sentiment score by decade. Between the 1980s and the 2010s, the lyrics of the 1980s have the highest mean sentiment score and the lyrics of the 2000s have the lowest; the difference between them is large, about 15.
Graph 6
plot(lyricssentiment$Year, lyricssentiment$n, main = "Net Sentiment Score by Year of Grammy Nominated Records from 1980 - 2019 with Linear Model Fit", ylab = "Net Sentiment", xlab = "Year")

# draw a graph that can show Net Sentiment Score by Year of Grammy Nominated Records from 1980 - 2019 with Linear Model Fit
# Using ggplot package to run the ggplot(), ggtitle(), geom_smooth(), xlab() and ylab() function
ggplot(lyricssentiment, aes(x = Year, y = n, color = decade)) + geom_point() + geom_smooth() +
ggtitle("Net Sentiment Score by Year of Grammy Nominated Records from 1980 - 2019 with Linear Model Fit") +
xlab("Year") +
ylab("Net Sentiment") +
geom_smooth(aes(x = Year, y = n), method = "lm", se = FALSE, inherit.aes = FALSE, colour = "black", size = 1)

The sixth graph shows the net sentiment score by year of Grammy-nominated records from 1980 to 2019. It makes it easy to see how the net sentiment changes from year to year, and the fitted line shows that the net sentiment has become higher from the 2010s on.
---
title: "Final Project"
output: html_notebook
date: "12/14/2020"
author: "Tingyu Qian"
---

### Load Packages
```{r}
library(rvest)
library(tidyverse)
library(genius)
library(tidytext)
library(dplyr)
library(ggplot2)
library(qdap)
library(mosaic)
library(lubridate)
library(data.table)
library(gridExtra)
```

### Data Access
```{r}
# Download the Wikipedia page for the Grammy Award for Record of the Year.
webpage <- read_html("https://en.wikipedia.org/wiki/Grammy_Award_for_Record_of_the_Year")

# Absolute XPaths of the tables for the 1980s, 1990s, 2000s and 2010s.
# NOTE(review): these paths are tied to the page layout at the time they were
# copied; re-check them if the article structure changes.
XPATH1980 <- '/html/body/div[3]/div[3]/div[5]/div[1]/table[5]'
XPATH1990 <- '/html/body/div[3]/div[3]/div[5]/div[1]/table[6]'
XPATH2000 <- '/html/body/div[3]/div[3]/div[5]/div[1]/table[7]'
XPATH2010 <- '/html/body/div[3]/div[3]/div[5]/div[1]/table[8]'

# Parse every HTML table matched by `xpath` on `page`.
#
# page:  parsed HTML document, as returned by read_html().
# xpath: XPath expression selecting the table node(s).
#
# Returns the list of data frames produced by html_table(fill = TRUE).
# Stops with an explicit message when the XPath matches nothing, instead of
# letting the later `[[1]]` fail with "subscript out of bounds".
read_decade_tables <- function(page, xpath) {
  tables <- page %>%
    html_nodes(xpath = xpath) %>%
    html_table(fill = TRUE)
  if (length(tables) == 0) {
    stop("No table found at XPath: ", xpath, call. = FALSE)
  }
  tables
}

# Create the table of songs for each decade from 1980 to 2019.
# Table of songs in the 1980s.
table_1980 <- read_decade_tables(webpage, XPATH1980)
d1980 <- table_1980[[1]]
d1980

# Table of songs in the 1990s.
table_1990 <- read_decade_tables(webpage, XPATH1990)
d1990 <- table_1990[[1]]
d1990

# Table of songs in the 2000s.
table_2000 <- read_decade_tables(webpage, XPATH2000)
d2000 <- table_2000[[1]]
d2000

# Table of songs in the 2010s.
table_2010 <- read_decade_tables(webpage, XPATH2010)
d2010 <- table_2010[[1]]
d2010

```

### Data Wrangling

```{r}
# Stack the 1980s and 1990s song tables into one table.
# bind_rows() (dplyr) appends rows and lines columns up by name. The previous
# full_join() without `by` joined on every shared column and only behaved like
# an append as long as no row of one table matched a row of the other.
table_19801990 <- bind_rows(d1980, d1990)
table_19801990
```

```{r}
# Append the 2000s song table to the combined 1980s-1990s table.
# bind_rows() (dplyr) appends rows and lines columns up by name. The previous
# full_join() without `by` joined on every shared column and only behaved like
# an append as long as no row of one table matched a row of the other.
table_198019902000 <- bind_rows(table_19801990, d2000)
table_198019902000
```

```{r}
# Append the 2010s song table to the combined 1980s-2000s table, then drop
# every row that contains a missing value.
# bind_rows() (dplyr) appends rows and lines columns up by name. The previous
# full_join() without `by` joined on every shared column and only behaved like
# an append as long as no row of one table matched a row of the other.
table1980to2010 <-
  table_198019902000 %>%
  bind_rows(d2010) %>%
  na.omit()
table1980to2010
```

```{r}
# Remove the "Production team" column; assigning NULL to a column deletes it.
table1980to2010[["Production team"]] <- NULL
table1980to2010
```

```{r}
# Rename the three remaining columns, in order, to Year, track and artis.
# NOTE(review): "artis" looks like a typo for "artist", but later chunks refer
# to the column by this exact name, so it is kept.
table1980to2010 <- setNames(table1980to2010, c("Year", "track", "artis"))
table1980to2010
```

```{r}
# Keep only the first four characters of each Year value, which removes the
# bracketed marker that follows the year. The column stays character.
table1980to2010[["Year"]] <- substr(table1980to2010[["Year"]], start = 1, stop = 4)
table1980to2010
  
```

```{r}
# Attach lyrics to every song with add_genius() from the genius package,
# using the `artis` column as the artist and `track` as the song title.
lyrics1980to2010 <- add_genius(table1980to2010, artis, track, type = "lyrics")
```

```{r}
# Copy the stop-word table that ships with the tidytext package into the
# workspace and display it.
stop_words <- tidytext::stop_words
stop_words
```


```{r}
# Split the `lyric` column into individual words (new column `word`) with
# unnest_tokens() from tidytext. Stop words can only be filtered out once the
# lyrics are in this word-level form.
verse_words <- unnest_tokens(lyrics1980to2010, output = word, input = lyric)
verse_words
```

```{r}
# Remove the stop words listed in tidytext's `stop_words` table.
# anti_join() (dplyr) keeps the rows of `verse_words` that have no match.
# The join key is given explicitly so the match is always on `word` and does
# not depend on which other columns the two tables happen to share.
ft <- verse_words %>%
  anti_join(stop_words, by = "word")

# Remove 11 further stop words that the anti_join above did not remove.
extra_stop_words <- c(
  "ba", "du", "yeah", "da", "ya", "ooh", "gonna", "na", "uh", "la", "hol"
)
ft1 <- ft %>%
  filter(!word %in% extra_stop_words)
ft1
```
Stop words cannot be filtered out while each row still holds a whole line of lyrics, so we first split the lyrics into one word per row with `unnest_tokens(word, lyric)`. Next, we remove the stop words listed in the tidytext package with `anti_join(stop_words)`. Finally, we remove 11 additional stop words that the tidytext list does not cover: the `!` operator negates the `%in%` test, so `filter(!word %in% c("ba", "du", "yeah", "da", "ya", "ooh", "gonna", "na", "uh", "la", "hol"))` keeps only the words that are not in that list. After these three steps, all stop words have been removed from the lyrics.

### Data Visualization

### Graph 1

```{r}
# Keep the words from songs whose Year is 1980-1989 and tag them with a
# `decade` column set to "1980s". filter() and mutate() come from dplyr.
lyrics1980s <- verse_words %>%
  filter(is.element(Year, 1980:1989)) %>%
  mutate(decade = "1980s")
lyrics1980s
```

```{r}
# Keep the words from songs whose Year is 1990-1999 and tag them with a
# `decade` column set to "1990s". filter() and mutate() come from dplyr.
lyrics1990s <- verse_words %>%
  filter(is.element(Year, 1990:1999)) %>%
  mutate(decade = "1990s")
lyrics1990s
```

```{r}
# Keep the words from songs whose Year is 2000-2009 and tag them with a
# `decade` column set to "2000s". filter() and mutate() come from dplyr.
lyrics2000s <- verse_words %>%
  filter(is.element(Year, 2000:2009)) %>%
  mutate(decade = "2000s")
lyrics2000s
```

```{r}
# Keep the words from songs whose Year is 2010-2019 and tag them with a
# `decade` column set to "2010s". filter() and mutate() come from dplyr.
lyrics2010s <- verse_words %>%
  filter(is.element(Year, 2010:2019)) %>%
  mutate(decade = "2010s")
lyrics2010s
```

```{r}
# Stack the word-level tables of the 1980s and the 1990s.
# bind_rows() (dplyr) simply appends the rows. The previous full_join()
# without `by` matched on every shared column of these word-level tables just
# to achieve the same append, and would merge or multiply rows if any matched.
lyrics1980sto1990s <- bind_rows(lyrics1980s, lyrics1990s)
lyrics1980sto1990s
```

```{r}
# Append the 2000s word-level table to the combined 1980s-1990s table.
# bind_rows() (dplyr) simply appends the rows. The previous full_join()
# without `by` matched on every shared column of these word-level tables just
# to achieve the same append, and would merge or multiply rows if any matched.
lyrics1980sto2000s <- bind_rows(lyrics1980sto1990s, lyrics2000s)
lyrics1980sto2000s
```

```{r}
# Append the 2010s word-level table to the combined 1980s-2000s table.
# bind_rows() (dplyr) simply appends the rows. The previous full_join()
# without `by` matched on every shared column of these word-level tables just
# to achieve the same append, and would merge or multiply rows if any matched.
lyrics1980sto2010s <- bind_rows(lyrics1980sto2000s, lyrics2010s)
lyrics1980sto2010s
```


```{r}
# Count the rows (one row per word) for every track/decade pair; `n` is the
# number of words of that track, used for the words-per-song boxplot.
# dplyr::count(track, decade) is the direct form of
# group_by(track, decade) %>% count %>% ungroup and returns an ungrouped table.
# NOTE(review): this overwrites the word-level table with the per-track
# counts, so re-running this chunk alone counts the already-counted table;
# re-run the chunk that builds `lyrics1980sto2010s` first.
# NOTE(review): tracks are identified by title only; two different songs with
# the same title in the same decade would be counted as one.
lyrics1980sto2010s <-
  lyrics1980sto2010s %>%
  dplyr::count(track, decade)

# Number of track/decade pairs.
nrow(lyrics1980sto2010s)

lyrics1980sto2010s

```


```{r}
# Boxplot (ggplot2) of the per-song word counts in lyrics1980sto2010s, with one
# box per decade, filled by decade.
ggplot(
  data = lyrics1980sto2010s,
  mapping = aes(x = decade, y = n, fill = decade)
) +
  geom_boxplot() +
  labs(
    title = "Boxplots of Words per Grammy Nominated Song by Decade",
    x = "Decade",
    y = "Words per Song"
  )
 
```
The first graph shows the distribution of the number of words per song for each decade. In the 1980s (1980-1989), the median number of words per song is about 300. In the 1990s (1990-1999), the median is about 290. In the 2000s (2000-2009), the median is about 375. In the 2010s (2010-2019), the median is about 380. The first graph therefore shows how the number of words per song changed from decade to decade, which is an interesting trend.

### Graph 2

```{r}
# Count how often each word occurs (stop words already removed), keep words
# used at least 3 times, and take the ten most frequent ones.
# `wt = n` names the ranking column explicitly; without it top_n() falls back
# to the last column of the table and prints "Selecting by n".
# Ties at the cut-off are all kept, so more than ten rows can be returned.
Graph2topten <- ft1 %>%
  dplyr::count(word, sort = TRUE) %>%
  filter(n >= 3) %>%
  #filter(word != 'em') %>% #Review topten and filter words missed by stop_words
  top_n(10, wt = n)
Graph2topten
```

```{r}
# Graph 2: bar chart of the counts in Graph2topten.
# reorder(word, -n) places the bars from the largest count to the smallest.
ggplot(Graph2topten, aes(x = reorder(word, -n), y = n)) +
  geom_col(fill = "pink") +
  labs(
    title = "Ten Most Popular Words of Grammy Nominated Songs from 1980 - 2019",
    x = "Word",
    y = "Count"
  )

```
The second graph shows the ten most popular words in Grammy Nominated Songs from 1980 - 2019 (excluding stop words), that is, which words appear most often in those lyrics. The graph shows that "love" is the most popular word in Grammy Nominated Songs from 1980 - 2019: it is used more than 500 times!


### Graph 3
```{r}
# Rows of ft1 whose Year falls in 1980-1989, tagged with decade = "1980s".
newlyrics1980s <-
  filter(ft1, Year %in% 1980:1989) %>%
  mutate(decade = "1980s")
newlyrics1980s
```

```{r}
# Rows of ft1 whose Year falls in 1990-1999, tagged with decade = "1990s".
newlyrics1990s <-
  filter(ft1, Year %in% 1990:1999) %>%
  mutate(decade = "1990s")
newlyrics1990s
```

```{r}
# Rows of ft1 whose Year falls in 2000-2009, tagged with decade = "2000s".
newlyrics2000s <-
  filter(ft1, Year %in% 2000:2009) %>%
  mutate(decade = "2000s")
newlyrics2000s
```

```{r}
# Rows of ft1 whose Year falls in 2010-2019, tagged with decade = "2010s".
newlyrics2010s <-
  filter(ft1, Year %in% 2010:2019) %>%
  mutate(decade = "2010s")
newlyrics2010s
```

```{r}
# Append the 1990s word rows to the 1980s word rows.
# Both inputs are slices of ft1 carrying different constant `decade` values,
# so no row of one can ever match a row of the other. A join keyed on every
# column therefore only concatenates them; bind_rows() does that directly and
# without the "Joining, by = ..." message.
newlyrics1980sto1990s <- bind_rows(newlyrics1980s, newlyrics1990s)
newlyrics1980sto1990s
```

```{r}
# Append the 2000s word rows to the 1980s-1990s word rows.
# The 2000s slice has decade = "2000s", which never occurs in the 1980s-1990s
# table, so the two inputs share no rows; bind_rows() concatenates them
# directly instead of running a join keyed on every column.
newlyrics1980sto2000s <- bind_rows(newlyrics1980sto1990s, newlyrics2000s)
newlyrics1980sto2000s
```

```{r}
# Append the 2010s word rows to the 1980s-2000s word rows, giving one table
# that covers all four decades.
# The 2010s slice has decade = "2010s", which never occurs in the 1980s-2000s
# table, so the two inputs share no rows; bind_rows() concatenates them
# directly instead of running a join keyed on every column.
newlyrics1980sto2010s <- bind_rows(newlyrics1980sto2000s, newlyrics2010s)
newlyrics1980sto2010s
```

```{r}
# Ten most frequent words in the 1980s slice.
# Tally every word (largest count first), drop words seen fewer than three
# times, then keep the ten largest counts. top_n() is given no `wt`, so it
# ranks by the last column (`n`) and keeps ties.
# If a leftover filler word such as 'em' shows up in the result, add
# filter(word != 'em') ahead of the top_n() step.
Graph3topten80s <-
  count(newlyrics1980s, word, sort = TRUE) %>%
  filter(n >= 3) %>%
  top_n(10)
Graph3topten80s
```

```{r}
# Ten most frequent words in the 1990s slice.
# Tally every word (largest count first), drop words seen fewer than three
# times, then keep the ten largest counts. top_n() is given no `wt`, so it
# ranks by the last column (`n`) and keeps ties.
# If a leftover filler word such as 'em' shows up in the result, add
# filter(word != 'em') ahead of the top_n() step.
Graph3topten90s <-
  count(newlyrics1990s, word, sort = TRUE) %>%
  filter(n >= 3) %>%
  top_n(10)
Graph3topten90s
```

```{r}
# Ten most frequent words in the 2000s slice.
# Tally every word (largest count first), drop words seen fewer than three
# times, then keep the ten largest counts. top_n() is given no `wt`, so it
# ranks by the last column (`n`) and keeps ties.
# If a leftover filler word such as 'em' shows up in the result, add
# filter(word != 'em') ahead of the top_n() step.
Graph3topten00s <-
  count(newlyrics2000s, word, sort = TRUE) %>%
  filter(n >= 3) %>%
  top_n(10)
Graph3topten00s
```

```{r}
# Ten most frequent words in the 2010s slice.
# Tally every word (largest count first), drop words seen fewer than three
# times, then keep the ten largest counts. top_n() is given no `wt`, so it
# ranks by the last column (`n`) and keeps ties.
# If a leftover filler word such as 'em' shows up in the result, add
# filter(word != 'em') ahead of the top_n() step.
Graph3topten10s <-
  count(newlyrics2010s, word, sort = TRUE) %>%
  filter(n >= 3) %>%
  top_n(10)
Graph3topten10s
```

```{r}
# 1980s panel: bar chart of the top-ten word counts, largest count first.
# The plot object is stored (not printed) so it can be arranged with the
# other decade panels later.
Graph3.1 <-
  ggplot(Graph3topten80s, aes(x = reorder(word, -n), y = n)) +
  geom_col(fill = "Blue") +
  labs(title = "1980s", x = "Word", y = "Count")
  
```

```{r}
# 1990s panel: bar chart of the top-ten word counts, largest count first.
# The plot object is stored (not printed) so it can be arranged with the
# other decade panels later.
Graph3.2 <-
  ggplot(Graph3topten90s, aes(x = reorder(word, -n), y = n)) +
  geom_col(fill = "pink") +
  labs(title = "1990s", x = "Word", y = "Count")
  
```

```{r}
# 2000s panel: bar chart of the top-ten word counts, largest count first.
# The plot object is stored (not printed) so it can be arranged with the
# other decade panels later.
Graph3.3 <-
  ggplot(Graph3topten00s, aes(x = reorder(word, -n), y = n)) +
  geom_col(fill = "red") +
  labs(title = "2000s", x = "Word", y = "Count")
  
```

```{r}
# 2010s panel: bar chart of the top-ten word counts, largest count first.
# The plot object is stored (not printed) so it can be arranged with the
# other decade panels later.
Graph3.4 <-
  ggplot(Graph3topten10s, aes(x = reorder(word, -n), y = n)) +
  geom_col(fill = "orange") +
  labs(title = "2010s", x = "Word", y = "Count")
 
```

```{r}
# Graph 3: place the four per-decade bar charts on one page under a shared
# heading with gridExtra::grid.arrange(); the returned object is kept in Graph3.
Graph3 <- grid.arrange(
  Graph3.1, Graph3.2,
  Graph3.3, Graph3.4,
  top = "Top Ten Words by Decade"
)
```
The third graph shows the top ten words in each decade. In the 1980s (1980-1989), the word "love" is the most popular word in the Grammy Nominated Songs, and it is used more than 150 times. Across the four panels, we can see that "love" is the most popular word in all 4 decades!


### Graph 4
```{r}
# Working copy of the combined 1980s-2010s word table under a new name, kept
# as a safety net so the sentiment steps do not touch the original object.
newlyrics1980sto2010sa <- newlyrics1980sto2010s
```

```{r}
# Per-word sentiment tallies for the 1980s-2010s lyrics.
# 1. inner_join() keeps only the lyric rows that appear in the bing lexicon
#    (no `by` is given, so the join uses the shared columns - presumably
#    `word`; confirm).
# 2. count() tallies rows per (sentiment, word).
# 3. spread() turns the sentiment labels into columns, filling gaps with 0.
# 4. polarity is the positive tally minus the negative tally.
sentiment <-
  inner_join(newlyrics1980sto2010sa, get_sentiments("bing")) %>%
  count(sentiment, word) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(polarity = positive - negative)
sentiment
```


```{r}
# Net sentiment per year.
# The bing lexicon is stored in `bing`, matched against the lyric words
# (no `by` is given, so the join uses the shared columns), tallied per
# (Year, sentiment), spread into one column per sentiment label with gaps
# filled by 0, and reduced to sentiment = positive - negative.
bing <- get_sentiments("bing")
lyricssentiment <-
  inner_join(newlyrics1980sto2010sa, bing) %>%
  count(Year, sentiment) %>%
  spread(key = sentiment, value = n, fill = 0) %>%
  mutate(sentiment = positive - negative)
lyricssentiment
```


```{r}
# Positive half of the bing lexicon.
positive_senti <- filter(get_sentiments("bing"), sentiment == "positive")
positive_senti

# Number of positive lyric rows per year.
# semi_join() keeps the lyric rows that have a match in positive_senti (no
# `by` is given, so the shared columns are used) without adding columns;
# count() then tallies them by Year, largest count first, in column `n`.
# NOTE: this assignment replaces any earlier value of lyricssentiment.
lyricssentiment <-
  semi_join(newlyrics1980sto2010sa, positive_senti) %>%
  count(Year, sort = TRUE)
lyricssentiment
```

```{r}
# Split the per-year table lyricssentiment into four decade slices.
# Each slice keeps the rows whose Year lies in that decade and gains a
# constant `decade` label.

# 1980-1989
senti80s <-
  filter(lyricssentiment, Year %in% 1980:1989) %>%
  mutate(decade = "1980s")
senti80s

# 1990-1999
senti90s <-
  filter(lyricssentiment, Year %in% 1990:1999) %>%
  mutate(decade = "1990s")
senti90s

# 2000-2009
senti00s <-
  filter(lyricssentiment, Year %in% 2000:2009) %>%
  mutate(decade = "2000s")
senti00s

# 2010-2019
senti10s <-
  filter(lyricssentiment, Year %in% 2010:2019) %>%
  mutate(decade = "2010s")
senti10s
```

```{r}
# Reassemble the four decade slices into one per-year table that carries the
# `decade` label.
# Every slice has its own constant `decade` value, so the slices share no
# rows. A join keyed on every column would only concatenate them;
# bind_rows() does that directly and without the "Joining, by = ..." message.

# 1980s + 1990s
senti8090 <- bind_rows(senti80s, senti90s)
senti8090

# 1980s + 1990s + 2000s
senti809000 <- bind_rows(senti8090, senti00s)
senti809000

# 1980s + 1990s + 2000s + 2010s; replaces the unlabelled lyricssentiment
lyricssentiment <- bind_rows(senti809000, senti10s)
lyricssentiment
```


```{r}
# Rebuild the decade-labelled word table from ft1 under the "*new" names.
# Each slice keeps the rows whose Year lies in that decade and gains a
# constant `decade` label.

# 1980-1989 -> "1980s"
newlyrics1980snew <- ft1 %>%
  filter(Year %in% 1980:1989) %>%
  mutate(decade = "1980s")
newlyrics1980snew

# 1990-1999 -> "1990s"
newlyrics1990snew <- ft1 %>%
  filter(Year %in% 1990:1999) %>%
  mutate(decade = "1990s")
newlyrics1990snew

# 2000-2009 -> "2000s"
newlyrics2000snew <- ft1 %>%
  filter(Year %in% 2000:2009) %>%
  mutate(decade = "2000s")
newlyrics2000snew

# 2010-2019 -> "2010s"
newlyrics2010snew <- ft1 %>%
  filter(Year %in% 2010:2019) %>%
  mutate(decade = "2010s")
newlyrics2010snew

# Stack the slices. Their `decade` values differ, so they share no rows and a
# join keyed on every column would only concatenate them; bind_rows() does
# that directly and without the "Joining, by = ..." message.

# 1980s + 1990s
newlyrics80sto90snew <- bind_rows(newlyrics1980snew, newlyrics1990snew)
newlyrics80sto90snew

# 1980s + 1990s + 2000s
newlyrics80sto00snew <- bind_rows(newlyrics80sto90snew, newlyrics2000snew)
newlyrics80sto00snew

# 1980s + 1990s + 2000s + 2010s
newlyrics80sto10snew <- bind_rows(newlyrics80sto00snew, newlyrics2010snew)
newlyrics80sto10snew
```


```{r}
# Attach sentiment labels to the decade-labelled word table.
# `sentiments` is not defined in this part of the file; presumably it is the
# lexicon dataset shipped with tidytext (confirm).
# full_join() keeps unmatched rows from both sides (padded with NA) and,
# with no `by`, joins on the columns the two tables share.
ft1sentiment <- full_join(newlyrics80sto10snew, sentiments)
ft1sentiment
```

```{r}
# Recode sentiment as a 0/1 factor (negative -> 0, positive -> 1) and drop
# every row that contains an NA in any column.
# The input is ft1sentiment, the joined table from the previous chunk. The
# earlier version read ft1sentiment_new on the right-hand side, an object
# that does not exist until this assignment has run.
# The levels are listed explicitly so the 0/1 mapping does not depend on
# alphabetical level order; any other sentiment label becomes NA and is
# removed by na.omit().
ft1sentiment_new <-
  ft1sentiment %>%
  mutate(
    sentiment = factor(
      sentiment,
      levels = c("negative", "positive"),
      labels = c(0, 1)
    )
  ) %>%
  na.omit()
ft1sentiment_new
```

```{r}
# Keep only the rows coded "1" (positive sentiment); rows coded "0"
# (negative sentiment) are discarded.
ft1sentiment_new_net <- filter(ft1sentiment_new, sentiment == "1")
ft1sentiment_new_net
```

```{r}
# Graph 4: one bar per Year, coloured by decade.
# geom_bar() is given no y aesthetic, so each bar's height is the number of
# rows of ft1sentiment_new_net for that year. Axis text is tilted 30 degrees.
ggplot(ft1sentiment_new_net, aes(x = Year, fill = decade)) +
  geom_bar() +
  labs(
    title = "Net Sentiment Score by Year",
    x = "Year",
    y = "Net Sentiment"
  ) +
  theme(axis.text = element_text(angle = 30))
```
The fourth graph shows the net sentiment score by year. Between 1980 and 2019, the lyrics from 2016 have the highest net sentiment score, at more than 100, and the lyrics from 2011 have the lowest net sentiment score, at nearly 5.

### Graph 5
```{r}
# Average of the yearly counts (`n`) across the 1980s slice.
mean(senti80s[["n"]])
```
```{r}
# Average of the yearly counts (`n`) across the 1990s slice.
mean(senti90s[["n"]])
```
```{r}
# Average of the yearly counts (`n`) across the 2000s slice.
mean(senti00s[["n"]])
```
```{r}
# Average of the yearly counts (`n`) across the 2010s slice.
mean(senti10s[["n"]])
```

```{r}
# One row per decade with the mean of that decade's yearly counts.
# The means are computed from the senti* tables (rounded to one decimal)
# rather than typed in by hand, and are stored as numbers. The earlier
# version stored them as character strings, which ggplot treats as a
# discrete y variable, so bar heights were not proportional to the means.
Meansenti <- data.frame(
  decade = c("1980s", "1990s", "2000s", "2010s"),
  mean = round(
    c(
      mean(senti80s$n),
      mean(senti90s$n),
      mean(senti00s$n),
      mean(senti10s$n)
    ),
    1
  )
)
Meansenti
```

```{r}
# Graph 5: one bar per decade from Meansenti, with `mean` on the y axis and
# each bar filled by its decade.
ggplot(Meansenti, aes(x = decade, y = mean, fill = decade)) +
  geom_col() +
  labs(
    title = "Mean Sentiment Score by Decade",
    x = "Decade",
    y = "Mean Sentiment Score"
  )
```
The fifth graph shows the mean sentiment score by decade. From the 1980s to the 2010s, the lyrics of the 1980s have the highest mean sentiment score and the lyrics of the 2000s have the lowest; the difference between them is large, about 15.


### Graph 6
```{r}
# Base-graphics scatter of the yearly count `n` against Year.
# No fitted line is added in this chunk.
plot(
  x = lyricssentiment$Year,
  y = lyricssentiment$n,
  main = "Net Sentiment Score by Year of Grammy Nominated Records from 1980 - 2019 with Linear Model Fit",
  xlab = "Year",
  ylab = "Net Sentiment"
)
```


```{r}
# Graph 6: yearly count `n` against Year, points coloured by decade.
# Layers, in drawing order:
#   1. the points;
#   2. geom_smooth() with default settings, which inherits the colour
#      grouping and so draws one smoother per decade;
#   3. a single black straight-line fit (method = "lm", no confidence band)
#      over all years; inherit.aes = FALSE stops it being split by decade.
ggplot(lyricssentiment, aes(x = Year, y = n, color = decade)) +
  geom_point() +
  geom_smooth() +
  labs(
    title = "Net Sentiment Score by Year of Grammy Nominated Records from 1980 - 2019 with Linear Model Fit",
    x = "Year",
    y = "Net Sentiment"
  ) +
  geom_smooth(
    mapping = aes(x = Year, y = n),
    method = "lm",
    se = FALSE,
    inherit.aes = FALSE,
    colour = "black",
    size = 1
  )
```

The sixth graph shows the net sentiment score by year of Grammy Nominated Records from 1980 - 2019. It makes it easy to see how the net sentiment changes from year to year, and the fitted line shows that the net sentiment has become higher since the 2010s.




